{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
      "  \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_path = '../input/'\n",
    "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
    "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
    "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户编码</th>\n",
       "      <th>用户实名制是否通过核实</th>\n",
       "      <th>用户年龄</th>\n",
       "      <th>是否大学生客户</th>\n",
       "      <th>是否黑名单客户</th>\n",
       "      <th>是否4G不健康客户</th>\n",
       "      <th>用户网龄（月）</th>\n",
       "      <th>用户最近一次缴费距今时长（月）</th>\n",
       "      <th>缴费用户最近一次缴费金额（元）</th>\n",
       "      <th>用户近6个月平均消费值（元）</th>\n",
       "      <th>...</th>\n",
       "      <th>当月是否景点游览</th>\n",
       "      <th>当月是否体育场馆消费</th>\n",
       "      <th>当月网购类应用使用次数</th>\n",
       "      <th>当月物流快递类应用使用次数</th>\n",
       "      <th>当月金融理财类应用使用总次数</th>\n",
       "      <th>当月视频播放类应用使用次数</th>\n",
       "      <th>当月飞机类应用使用次数</th>\n",
       "      <th>当月火车类应用使用次数</th>\n",
       "      <th>当月旅游资讯类应用使用次数</th>\n",
       "      <th>信用分</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a4651f98c82948b186bdcdc8108381b4</td>\n",
       "      <td>1</td>\n",
       "      <td>44</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>186</td>\n",
       "      <td>1</td>\n",
       "      <td>99.8</td>\n",
       "      <td>163.86</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>713</td>\n",
       "      <td>0</td>\n",
       "      <td>2740</td>\n",
       "      <td>7145</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30</td>\n",
       "      <td>664</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                               用户编码  用户实名制是否通过核实  用户年龄  是否大学生客户  是否黑名单客户  \\\n",
       "0  a4651f98c82948b186bdcdc8108381b4            1    44        0        0   \n",
       "\n",
       "   是否4G不健康客户  用户网龄（月）  用户最近一次缴费距今时长（月）  缴费用户最近一次缴费金额（元）  用户近6个月平均消费值（元） ...   \\\n",
       "0          0      186                1             99.8          163.86 ...    \n",
       "\n",
       "   当月是否景点游览  当月是否体育场馆消费  当月网购类应用使用次数  当月物流快递类应用使用次数  当月金融理财类应用使用总次数  \\\n",
       "0         1           1          713              0            2740   \n",
       "\n",
       "   当月视频播放类应用使用次数  当月飞机类应用使用次数  当月火车类应用使用次数  当月旅游资讯类应用使用次数  信用分  \n",
       "0           7145            0            0             30  664  \n",
       "\n",
       "[1 rows x 30 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.head(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
      "       '用户网龄（月）', '用户最近一次缴费距今时长（月）', '缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）',\n",
      "       '用户账单当月总费用（元）', '用户当月账户余额（元）', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
      "       '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
      "       '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
      "       '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
      "       '当月旅游资讯类应用使用次数', '信用分'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "#all chinese name- -\n",
    "#rename one by one\n",
    "print(train_data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
    "                     '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
    "                     'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
    "                     'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
    "                     'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
    "                     'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
    "                     'finance_app_count','video_app_count','flight_count','train_count',\\\n",
    "                     'tour_app_count','score']\n",
    "test_data.columns = train_data.columns[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  # This is added back by InteractiveShellApp.init_path()\n"
     ]
    }
   ],
   "source": [
    "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
    "#先前余额，当前余额 + 当月话费 - 上次缴费 --- useless\n",
    "#充值金额/余额 --- useless\n",
    "#当月话费/最近充值金额 --- useless\n",
    "#六个月均值/充值金额 --- useless\n",
    "\n",
    "#top up amount, 充值金额是整数，和小数，应该对应不同的充值途径？\n",
    "\n",
    "def produce_offline_feat(train_data):\n",
    "    train_data['top_up_amount_offline'] = 0\n",
    "    train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
    "                               train_data['top_up_amount'] != 0] = 1\n",
    "    return train_data\n",
    "\n",
    "train_data = produce_offline_feat(train_data)\n",
    "test_data = produce_offline_feat(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def produce_fee_rate(train_data):\n",
    "    #看importance，当月话费 和最近半年平均话费都很高，算一下当月/半年 -->稳定性\n",
    "    train_data['current_fee_stability'] = \\\n",
    "    train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
    "    \n",
    "    #当月话费/当月账户余额\n",
    "    train_data['use_left_rate'] = \\\n",
    "    train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
    "    return train_data\n",
    "\n",
    "train_data = produce_fee_rate(train_data)\n",
    "test_data = produce_fee_rate(test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def display_importances(feature_importance_df_):\n",
    "    cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
    "    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
    "    plt.figure(figsize=(8, 10))\n",
    "    sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
    "    plt.title('LightGBM Features (avg over folds)')\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#para\n",
    "params = {\n",
    "    'learning_rate': 0.01,\n",
    "    'boosting_type': 'gbdt',\n",
    "    'objective': 'regression_l1',\n",
    "    'metric': 'mae',\n",
    "    'feature_fraction': 0.6,\n",
    "    'bagging_fraction': 0.8,\n",
    "    'bagging_freq': 2,\n",
    "    'num_leaves': 31,\n",
    "    'verbose': -1,\n",
    "    'max_depth': 5,\n",
    "    'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#para\n",
    "params2 = {\n",
    "    'learning_rate': 0.01,\n",
    "    'boosting_type': 'gbdt',\n",
    "    'objective': 'regression_l2',\n",
    "    'metric': 'mae',\n",
    "    'feature_fraction': 0.6,\n",
    "    'bagging_fraction': 0.8,\n",
    "    'bagging_freq': 2,\n",
    "    'num_leaves': 31,\n",
    "    'verbose': -1,\n",
    "    'max_depth': 5,\n",
    "    'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n",
    "    'seed': 89\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2433]\tvalid_0's l1: 14.7441\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[1876]\tvalid_0's l1: 14.8595\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2459]\tvalid_0's l1: 14.7082\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2468]\tvalid_0's l1: 14.6564\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2599]\tvalid_0's l1: 14.5114\n",
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[3313]\tvalid_0's l1: 14.743\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2590]\tvalid_0's l1: 14.8562\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2523]\tvalid_0's l1: 14.5752\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[3564]\tvalid_0's l1: 14.6125\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[1853]\tvalid_0's l1: 14.6333\n",
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2851]\tvalid_0's l1: 14.9587\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[1875]\tvalid_0's l1: 14.7808\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2957]\tvalid_0's l1: 14.5525\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2723]\tvalid_0's l1: 14.4804\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[3311]\tvalid_0's l1: 14.6854\n"
     ]
    }
   ],
   "source": [
    "cv_pred_all = 0\n",
    "en_amount = 3\n",
    "for seed in range(en_amount):\n",
    "    NFOLDS = 5\n",
    "    train_label = train_data['score']\n",
    "    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n",
    "    kf = kfold.split(train_data, train_label)\n",
    "\n",
    "    train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
    "    test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
    "\n",
    "\n",
    "    cv_pred = np.zeros(test_data.shape[0])\n",
    "    valid_best_l2_all = 0\n",
    "\n",
    "    feature_importance_df = pd.DataFrame()\n",
    "    count = 0\n",
    "    for i, (train_fold, validate) in enumerate(kf):\n",
    "        print('fold: ',i, ' training')\n",
    "        X_train, X_validate, label_train, label_validate = \\\n",
    "        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
    "        train_label[train_fold], train_label[validate]\n",
    "        dtrain = lgb.Dataset(X_train, label_train)\n",
    "        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
    "        bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
    "        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
    "        valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
    "\n",
    "#         fold_importance_df = pd.DataFrame()\n",
    "#         fold_importance_df[\"feature\"] = list(X_train.columns)\n",
    "#         fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
    "#         fold_importance_df[\"fold\"] = count + 1\n",
    "#         feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "        count += 1\n",
    "\n",
    "    cv_pred /= NFOLDS\n",
    "    valid_best_l2_all /= NFOLDS\n",
    "    \n",
    "    cv_pred_all += cv_pred\n",
    "cv_pred_all /= en_amount\n",
    "    #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2457]\tvalid_0's l1: 14.7871\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2365]\tvalid_0's l1: 14.6983\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2082]\tvalid_0's l1: 14.7999\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2266]\tvalid_0's l1: 14.483\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2046]\tvalid_0's l1: 14.7681\n",
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2436]\tvalid_0's l1: 14.7728\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2053]\tvalid_0's l1: 14.8066\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2221]\tvalid_0's l1: 14.5464\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2348]\tvalid_0's l1: 14.5198\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2207]\tvalid_0's l1: 14.8169\n",
      "fold:  0  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2110]\tvalid_0's l1: 14.5323\n",
      "fold:  1  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2627]\tvalid_0's l1: 14.8493\n",
      "fold:  2  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2040]\tvalid_0's l1: 14.8335\n",
      "fold:  3  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2241]\tvalid_0's l1: 14.6379\n",
      "fold:  4  training\n",
      "Training until validation scores don't improve for 50 rounds.\n",
      "Early stopping, best iteration is:\n",
      "[2424]\tvalid_0's l1: 14.6794\n"
     ]
    }
   ],
   "source": [
    "cv_pred_all2 = 0\n",
    "en_amount = 3\n",
    "for seed in range(en_amount):\n",
    "    NFOLDS = 5\n",
    "    train_label = train_data['score']\n",
    "    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n",
    "    kf = kfold.split(train_data, train_label)\n",
    "\n",
    "    train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
    "    test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
    "\n",
    "\n",
    "    cv_pred = np.zeros(test_data.shape[0])\n",
    "    valid_best_l2_all = 0\n",
    "\n",
    "    feature_importance_df = pd.DataFrame()\n",
    "    count = 0\n",
    "    for i, (train_fold, validate) in enumerate(kf):\n",
    "        print('fold: ',i, ' training')\n",
    "        X_train, X_validate, label_train, label_validate = \\\n",
    "        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
    "        train_label[train_fold], train_label[validate]\n",
    "        dtrain = lgb.Dataset(X_train, label_train)\n",
    "        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
    "        bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
    "        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
    "        valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
    "\n",
    "#         fold_importance_df = pd.DataFrame()\n",
    "#         fold_importance_df[\"feature\"] = list(X_train.columns)\n",
    "#         fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
    "#         fold_importance_df[\"fold\"] = count + 1\n",
    "#         feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "        count += 1\n",
    "\n",
    "    cv_pred /= NFOLDS\n",
    "    valid_best_l2_all /= NFOLDS\n",
    "    \n",
    "    cv_pred_all2 += cv_pred\n",
    "    \n",
    "cv_pred_all2 /= en_amount\n",
    "    #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# display_importances(feature_importance_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "baseline\n",
    "cv score for valid is:  0.06342856152921912 --- 0.06339265000\n",
    "    \n",
    "#充值金额是否为整数\n",
    "cv score for valid is:  0.06343660584697094\n",
    "#当月话费/半年话费\n",
    "cv score for valid is:  0.06349188259250227\n",
    "#当月话费/余额\n",
    "cv score for valid is:  0.06350638782547711\n",
    "    \n",
    "#leaves 31\n",
    "cv score for valid is:  0.06354362406472286\n",
    "#remove l1, l2 = 5\n",
    "cv score for valid is:  0.06358730556250403\n",
    "#feature fraction 0.7\n",
    "cv score for valid is:  0.06361478051326884 --- 0.06355141000\n",
    "max_depth 5, objective l1\n",
    "cv score for valid is:  0.06367445081783887\n",
    "feature fraction 0.6\n",
    "cv score for valid is:  0.06377264215140695 --- 0.06379867000\n",
    "10 fold\n",
    "cv score for valid is:  0.0637915578042461 --- 6378 --- useless\n",
    "remove blk list flag\n",
    "cv score for valid is:  0.06377613710442855"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n",
      "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  after removing the cwd from sys.path.\n",
      "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"\n"
     ]
    }
   ],
   "source": [
    "test_data_sub = test_data[['uid']]\n",
    "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n",
    "test_data_sub.columns = ['id','score']\n",
    "test_data_sub['score1'] = cv_pred_all\n",
    "test_data_sub['score2'] = cv_pred_all2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "617.8386873193765"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n",
    "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
